In [1]:
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
%matplotlib inline
import pandas_profiling as pp
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
In [82]:
pp.ProfileReport(Account_Table)
Out[82]:

In [ ]:
 
In [ ]:
 
In [2]:
df = pd.read_csv(r'C:\Users\nitishkumar\Desktop\P\Cred\Data\Combined_Data\combined_final_sheet.csv')
In [3]:
df
Out[3]:
report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.AVERAGE-ACCOUNT-AGE-MONTH 9 user_10
0 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.AVE... 0 user_10
1 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.INQ... 0 user_10
2 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.LEN... 0 user_10
3 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.LEN... 1 user_10
4 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.NEW... 0 user_10
... ... ... ...
4936166 report.SCORES.SCORE.SCORE-COMMENTS A-Very Low Risk user_99997
4936167 report.SCORES.SCORE.SCORE-TYPE PERFORM CONSUMER 2.0 user_99997
4936168 report.SCORES.SCORE.SCORE-VALUE 824 user_99997
4936169 report.SECONDARY-MATCHES NaN user_99997
4936170 report.request_type customized_request_refresh user_99997

4936171 rows × 3 columns

In [4]:
df.shape
Out[4]:
(4936171, 3)
In [5]:
df.rename(columns={df.columns[0]:'A',df.columns[1]:'G'}, inplace=True)
In [6]:
df
Out[6]:
A G user_10
0 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.AVE... 0 user_10
1 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.INQ... 0 user_10
2 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.LEN... 0 user_10
3 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.LEN... 1 user_10
4 report.ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.NEW... 0 user_10
... ... ... ...
4936166 report.SCORES.SCORE.SCORE-COMMENTS A-Very Low Risk user_99997
4936167 report.SCORES.SCORE.SCORE-TYPE PERFORM CONSUMER 2.0 user_99997
4936168 report.SCORES.SCORE.SCORE-VALUE 824 user_99997
4936169 report.SECONDARY-MATCHES NaN user_99997
4936170 report.request_type customized_request_refresh user_99997

4936171 rows × 3 columns

In [7]:
df['A'] = df['A'].map(lambda x: x.lstrip('report.'))
In [8]:
df
Out[8]:
A G user_10
0 ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.AVERAGE-AC... 0 user_10
1 ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.INQURIES-I... 0 user_10
2 ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.LENGTH-OF-... 0 user_10
3 ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.LENGTH-OF-... 1 user_10
4 ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.NEW-ACCOUN... 0 user_10
... ... ... ...
4936166 SCORES.SCORE.SCORE-COMMENTS A-Very Low Risk user_99997
4936167 SCORES.SCORE.SCORE-TYPE PERFORM CONSUMER 2.0 user_99997
4936168 SCORES.SCORE.SCORE-VALUE 824 user_99997
4936169 SECONDARY-MATCHES NaN user_99997
4936170 quest_type customized_request_refresh user_99997

4936171 rows × 3 columns

In [9]:
new = df['A'].str.split('.', expand=True)
In [10]:
new
Out[10]:
0 1 2 3 4 5
0 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES AVERAGE-ACCOUNT-AGE-YEAR None None None
1 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES INQURIES-IN-LAST-SIX-MONTHS None None None
2 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES LENGTH-OF-CREDIT-HISTORY-MONTH None None None
3 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES LENGTH-OF-CREDIT-HISTORY-YEAR None None None
4 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES NEW-ACCOUNTS-IN-LAST-SIX-MONTHS None None None
... ... ... ... ... ... ...
4936166 SCORES SCORE SCORE-COMMENTS None None None
4936167 SCORES SCORE SCORE-TYPE None None None
4936168 SCORES SCORE SCORE-VALUE None None None
4936169 SECONDARY-MATCHES None None None None None
4936170 quest_type None None None None None

4936171 rows × 6 columns

In [11]:
new.shape
Out[11]:
(4936171, 6)
In [12]:
df_final = pd.concat([new,df ], axis = 1)
In [13]:
df_final.drop(['A'], axis = 1) 
Out[13]:
0 1 2 3 4 5 G user_10
0 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES AVERAGE-ACCOUNT-AGE-YEAR None None None 0 user_10
1 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES INQURIES-IN-LAST-SIX-MONTHS None None None 0 user_10
2 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES LENGTH-OF-CREDIT-HISTORY-MONTH None None None 0 user_10
3 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES LENGTH-OF-CREDIT-HISTORY-YEAR None None None 1 user_10
4 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES NEW-ACCOUNTS-IN-LAST-SIX-MONTHS None None None 0 user_10
... ... ... ... ... ... ... ... ...
4936166 SCORES SCORE SCORE-COMMENTS None None None A-Very Low Risk user_99997
4936167 SCORES SCORE SCORE-TYPE None None None PERFORM CONSUMER 2.0 user_99997
4936168 SCORES SCORE SCORE-VALUE None None None 824 user_99997
4936169 SECONDARY-MATCHES None None None None None NaN user_99997
4936170 quest_type None None None None None customized_request_refresh user_99997

4936171 rows × 8 columns

In [14]:
df_final.columns = ['x','a','b','c','d','e','f','G','U_USER_ID']
In [15]:
df_final
Out[15]:
x a b c d e f G U_USER_ID
0 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES AVERAGE-ACCOUNT-AGE-YEAR None None None ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.AVERAGE-AC... 0 user_10
1 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES INQURIES-IN-LAST-SIX-MONTHS None None None ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.INQURIES-I... 0 user_10
2 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES LENGTH-OF-CREDIT-HISTORY-MONTH None None None ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.LENGTH-OF-... 0 user_10
3 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES LENGTH-OF-CREDIT-HISTORY-YEAR None None None ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.LENGTH-OF-... 1 user_10
4 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES NEW-ACCOUNTS-IN-LAST-SIX-MONTHS None None None ACCOUNTS-SUMMARY.DERIVED-ATTRIBUTES.NEW-ACCOUN... 0 user_10
... ... ... ... ... ... ... ... ... ...
4936166 SCORES SCORE SCORE-COMMENTS None None None SCORES.SCORE.SCORE-COMMENTS A-Very Low Risk user_99997
4936167 SCORES SCORE SCORE-TYPE None None None SCORES.SCORE.SCORE-TYPE PERFORM CONSUMER 2.0 user_99997
4936168 SCORES SCORE SCORE-VALUE None None None SCORES.SCORE.SCORE-VALUE 824 user_99997
4936169 SECONDARY-MATCHES None None None None None SECONDARY-MATCHES NaN user_99997
4936170 quest_type None None None None None quest_type customized_request_refresh user_99997

4936171 rows × 9 columns

In [16]:
df_final.drop(['f'], axis = 1,inplace = True) 
In [17]:
df_final.columns = ['a','b','c','d','e','f','G','U_USER_ID']
In [18]:
df_final
Out[18]:
a b c d e f G U_USER_ID
0 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES AVERAGE-ACCOUNT-AGE-YEAR None None None 0 user_10
1 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES INQURIES-IN-LAST-SIX-MONTHS None None None 0 user_10
2 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES LENGTH-OF-CREDIT-HISTORY-MONTH None None None 0 user_10
3 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES LENGTH-OF-CREDIT-HISTORY-YEAR None None None 1 user_10
4 ACCOUNTS-SUMMARY DERIVED-ATTRIBUTES NEW-ACCOUNTS-IN-LAST-SIX-MONTHS None None None 0 user_10
... ... ... ... ... ... ... ... ...
4936166 SCORES SCORE SCORE-COMMENTS None None None A-Very Low Risk user_99997
4936167 SCORES SCORE SCORE-TYPE None None None PERFORM CONSUMER 2.0 user_99997
4936168 SCORES SCORE SCORE-VALUE None None None 824 user_99997
4936169 SECONDARY-MATCHES None None None None None NaN user_99997
4936170 quest_type None None None None None customized_request_refresh user_99997

4936171 rows × 8 columns

In [19]:
OPEN_DATE = df_final[df_final['d'].str.match('DISBURSED-DT').fillna(False)]
OPEN_DATE.rename(columns = {"G": "OPEN_DATE"},inplace = True)
OPEN_DATE.reset_index(inplace = True) 
OPEN_DATE
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:4223: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)
Out[19]:
index a b c d e f OPEN_DATE U_USER_ID
0 81 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-DT None None 08-01-2019 user_10
1 99 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-DT None None 05-10-2015 user_10
2 209 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-DT None None 15-07-2015 user_1023
3 238 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-DT None None 20-08-2018 user_1023
4 267 RESPONSES RESPONSE[3] LOAN-DETAILS DISBURSED-DT None None 06-12-2018 user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935973 RESPONSES RESPONSE[7] LOAN-DETAILS DISBURSED-DT None None 31-07-2012 user_99995
137374 4936082 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-DT None None 05-10-2006 user_99997
137375 4936122 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-DT None None 01-08-2012 user_99997
137376 4936140 RESPONSES RESPONSE[3] LOAN-DETAILS DISBURSED-DT None None 30-11-2017 user_99997
137377 4936158 RESPONSES RESPONSE[4] LOAN-DETAILS DISBURSED-DT None None 30-09-2006 user_99997

137378 rows × 9 columns

In [20]:
DATE_CLOSED  = df_final[df_final['d'].str.match('CLOSED-DATE').fillna(False)]
DATE_CLOSED.rename(columns = {"G": "DATE_CLOSED"},inplace = True)
DATE_CLOSED.reset_index(inplace = True) 
DATE_CLOSED
Out[20]:
index a b c d e f DATE_CLOSED U_USER_ID
0 74 RESPONSES RESPONSE[1] LOAN-DETAILS CLOSED-DATE None None NaN user_10
1 92 RESPONSES RESPONSE[2] LOAN-DETAILS CLOSED-DATE None None 27-10-2016 user_10
2 202 RESPONSES RESPONSE[1] LOAN-DETAILS CLOSED-DATE None None NaN user_1023
3 231 RESPONSES RESPONSE[2] LOAN-DETAILS CLOSED-DATE None None NaN user_1023
4 260 RESPONSES RESPONSE[3] LOAN-DETAILS CLOSED-DATE None None NaN user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935966 RESPONSES RESPONSE[7] LOAN-DETAILS CLOSED-DATE None None NaN user_99995
137374 4936075 RESPONSES RESPONSE[1] LOAN-DETAILS CLOSED-DATE None None NaN user_99997
137375 4936115 RESPONSES RESPONSE[2] LOAN-DETAILS CLOSED-DATE None None 29-08-2016 user_99997
137376 4936133 RESPONSES RESPONSE[3] LOAN-DETAILS CLOSED-DATE None None NaN user_99997
137377 4936151 RESPONSES RESPONSE[4] LOAN-DETAILS CLOSED-DATE None None 28-03-2011 user_99997

137378 rows × 9 columns

In [21]:
ACCOUNT_TYPE  = df_final[df_final['d'].str.match('ACCT-TYPE').fillna(False)]
ACCOUNT_TYPE.rename(columns = {"G": "ACCOUNT_TYPE"},inplace = True)
ACCOUNT_TYPE.reset_index(inplace = True) 
ACCOUNT_TYPE
Out[21]:
index a b c d e f ACCOUNT_TYPE U_USER_ID
0 73 RESPONSES RESPONSE[1] LOAN-DETAILS ACCT-TYPE None None Credit Card user_10
1 91 RESPONSES RESPONSE[2] LOAN-DETAILS ACCT-TYPE None None Housing Loan user_10
2 201 RESPONSES RESPONSE[1] LOAN-DETAILS ACCT-TYPE None None Credit Card user_1023
3 230 RESPONSES RESPONSE[2] LOAN-DETAILS ACCT-TYPE None None Credit Card user_1023
4 259 RESPONSES RESPONSE[3] LOAN-DETAILS ACCT-TYPE None None Credit Card user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935965 RESPONSES RESPONSE[7] LOAN-DETAILS ACCT-TYPE None None Credit Card user_99995
137374 4936074 RESPONSES RESPONSE[1] LOAN-DETAILS ACCT-TYPE None None Credit Card user_99997
137375 4936114 RESPONSES RESPONSE[2] LOAN-DETAILS ACCT-TYPE None None Credit Card user_99997
137376 4936132 RESPONSES RESPONSE[3] LOAN-DETAILS ACCT-TYPE None None Credit Card user_99997
137377 4936150 RESPONSES RESPONSE[4] LOAN-DETAILS ACCT-TYPE None None Credit Card user_99997

137378 rows × 9 columns

In [22]:
AMOUNT_PAST_DUE  = df_final[df_final['d'].str.match('OVERDUE-AMT').fillna(False)]
AMOUNT_PAST_DUE.rename(columns = {"G": "AMOUNT_PAST_DUE"},inplace = True)
AMOUNT_PAST_DUE.reset_index(inplace = True) 
AMOUNT_PAST_DUE
Out[22]:
index a b c d e f AMOUNT_PAST_DUE U_USER_ID
0 84 RESPONSES RESPONSE[1] LOAN-DETAILS OVERDUE-AMT None None 0 user_10
1 102 RESPONSES RESPONSE[2] LOAN-DETAILS OVERDUE-AMT None None 0 user_10
2 223 RESPONSES RESPONSE[1] LOAN-DETAILS OVERDUE-AMT None None NaN user_1023
3 252 RESPONSES RESPONSE[2] LOAN-DETAILS OVERDUE-AMT None None 0 user_1023
4 270 RESPONSES RESPONSE[3] LOAN-DETAILS OVERDUE-AMT None None NaN user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935976 RESPONSES RESPONSE[7] LOAN-DETAILS OVERDUE-AMT None None 0 user_99995
137374 4936107 RESPONSES RESPONSE[1] LOAN-DETAILS OVERDUE-AMT None None 0 user_99997
137375 4936125 RESPONSES RESPONSE[2] LOAN-DETAILS OVERDUE-AMT None None 0 user_99997
137376 4936143 RESPONSES RESPONSE[3] LOAN-DETAILS OVERDUE-AMT None None 0 user_99997
137377 4936161 RESPONSES RESPONSE[4] LOAN-DETAILS OVERDUE-AMT None None 0 user_99997

137378 rows × 9 columns

In [23]:
ACCOUNT_STATUS  = df_final[df_final['d'].str.match('ACCOUNT-STATUS').fillna(False)]
ACCOUNT_STATUS.rename(columns = {"G": "ACCOUNT_STATUS"},inplace = True)
ACCOUNT_STATUS.reset_index(inplace = True) 
ACCOUNT_STATUS
Out[23]:
index a b c d e f ACCOUNT_STATUS U_USER_ID
0 71 RESPONSES RESPONSE[1] LOAN-DETAILS ACCOUNT-STATUS None None Active user_10
1 89 RESPONSES RESPONSE[2] LOAN-DETAILS ACCOUNT-STATUS None None Closed user_10
2 199 RESPONSES RESPONSE[1] LOAN-DETAILS ACCOUNT-STATUS None None Active user_1023
3 228 RESPONSES RESPONSE[2] LOAN-DETAILS ACCOUNT-STATUS None None Active user_1023
4 257 RESPONSES RESPONSE[3] LOAN-DETAILS ACCOUNT-STATUS None None Active user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935963 RESPONSES RESPONSE[7] LOAN-DETAILS ACCOUNT-STATUS None None Active user_99995
137374 4936072 RESPONSES RESPONSE[1] LOAN-DETAILS ACCOUNT-STATUS None None Active user_99997
137375 4936112 RESPONSES RESPONSE[2] LOAN-DETAILS ACCOUNT-STATUS None None Closed user_99997
137376 4936130 RESPONSES RESPONSE[3] LOAN-DETAILS ACCOUNT-STATUS None None Active user_99997
137377 4936148 RESPONSES RESPONSE[4] LOAN-DETAILS ACCOUNT-STATUS None None Closed user_99997

137378 rows × 9 columns

In [24]:
ACCOUNTHOLDER_TYPE_CODE  = df_final[df_final['d'].str.match('OWNERSHIP-IND').fillna(False)]
ACCOUNTHOLDER_TYPE_CODE.rename(columns = {"G": "ACCOUNTHOLDER_TYPE_CODE"},inplace = True)
ACCOUNTHOLDER_TYPE_CODE.reset_index(inplace = True) 
ACCOUNTHOLDER_TYPE_CODE
Out[24]:
index a b c d e f ACCOUNTHOLDER_TYPE_CODE U_USER_ID
0 85 RESPONSES RESPONSE[1] LOAN-DETAILS OWNERSHIP-IND None None Individual user_10
1 103 RESPONSES RESPONSE[2] LOAN-DETAILS OWNERSHIP-IND None None Joint user_10
2 224 RESPONSES RESPONSE[1] LOAN-DETAILS OWNERSHIP-IND None None Individual user_1023
3 253 RESPONSES RESPONSE[2] LOAN-DETAILS OWNERSHIP-IND None None Individual user_1023
4 271 RESPONSES RESPONSE[3] LOAN-DETAILS OWNERSHIP-IND None None Individual user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935977 RESPONSES RESPONSE[7] LOAN-DETAILS OWNERSHIP-IND None None Individual user_99995
137374 4936108 RESPONSES RESPONSE[1] LOAN-DETAILS OWNERSHIP-IND None None Individual user_99997
137375 4936126 RESPONSES RESPONSE[2] LOAN-DETAILS OWNERSHIP-IND None None Individual user_99997
137376 4936144 RESPONSES RESPONSE[3] LOAN-DETAILS OWNERSHIP-IND None None Individual user_99997
137377 4936162 RESPONSES RESPONSE[4] LOAN-DETAILS OWNERSHIP-IND None None Individual user_99997

137378 rows × 9 columns

In [25]:
HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT   = df_final[df_final['d'].str.match('DISBURSED-AMT').fillna(False)]
HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT.rename(columns = {"G": "HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT"},inplace = True)
HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT.reset_index(inplace = True) 
HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT
Out[25]:
index a b c d e f HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT U_USER_ID
0 80 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-AMT None None 20 727 user_10
1 98 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-AMT None None 15 00 000 user_10
2 208 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-AMT None None 63 705 user_1023
3 237 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-AMT None None 27 042 user_1023
4 266 RESPONSES RESPONSE[3] LOAN-DETAILS DISBURSED-AMT None None 7 327 user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935972 RESPONSES RESPONSE[7] LOAN-DETAILS DISBURSED-AMT None None 1 25 179 user_99995
137374 4936081 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-AMT None None 1 07 686 user_99997
137375 4936121 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-AMT None None NaN user_99997
137376 4936139 RESPONSES RESPONSE[3] LOAN-DETAILS DISBURSED-AMT None None 1 536 user_99997
137377 4936157 RESPONSES RESPONSE[4] LOAN-DETAILS DISBURSED-AMT None None 10 395 user_99997

137378 rows × 9 columns

In [26]:
CURRENT_BALANCE    = df_final[df_final['d'].str.match('CURRENT-BAL').fillna(False)]
CURRENT_BALANCE.rename(columns = {"G": "CURRENT_BALANCE"},inplace = True)
CURRENT_BALANCE.reset_index(inplace = True) 
CURRENT_BALANCE
Out[26]:
index a b c d e f CURRENT_BALANCE U_USER_ID
0 78 RESPONSES RESPONSE[1] LOAN-DETAILS CURRENT-BAL None None 4 193 user_10
1 96 RESPONSES RESPONSE[2] LOAN-DETAILS CURRENT-BAL None None 0 user_10
2 206 RESPONSES RESPONSE[1] LOAN-DETAILS CURRENT-BAL None None 103 user_1023
3 235 RESPONSES RESPONSE[2] LOAN-DETAILS CURRENT-BAL None None 16 121 user_1023
4 264 RESPONSES RESPONSE[3] LOAN-DETAILS CURRENT-BAL None None 0 user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935970 RESPONSES RESPONSE[7] LOAN-DETAILS CURRENT-BAL None None 16 942 user_99995
137374 4936079 RESPONSES RESPONSE[1] LOAN-DETAILS CURRENT-BAL None None 16 290 user_99997
137375 4936119 RESPONSES RESPONSE[2] LOAN-DETAILS CURRENT-BAL None None 0 user_99997
137376 4936137 RESPONSES RESPONSE[3] LOAN-DETAILS CURRENT-BAL None None -689 user_99997
137377 4936155 RESPONSES RESPONSE[4] LOAN-DETAILS CURRENT-BAL None None 0 user_99997

137378 rows × 9 columns

In [27]:
Credit_Limit_Amount      = df_final[df_final['d'].str.match('CREDIT-LIMIT').fillna(False)]
Credit_Limit_Amount.rename(columns = {"G": "Credit_Limit_Amount"}, inplace = True)
Credit_Limit_Amount.reset_index(inplace = True) 
Credit_Limit_Amount
Out[27]:
index a b c d e f Credit_Limit_Amount U_USER_ID
0 77 RESPONSES RESPONSE[1] LOAN-DETAILS CREDIT-LIMIT None None 20 000 user_10
1 95 RESPONSES RESPONSE[2] LOAN-DETAILS CREDIT-LIMIT None None NaN user_10
2 205 RESPONSES RESPONSE[1] LOAN-DETAILS CREDIT-LIMIT None None NaN user_1023
3 234 RESPONSES RESPONSE[2] LOAN-DETAILS CREDIT-LIMIT None None 61 000 user_1023
4 263 RESPONSES RESPONSE[3] LOAN-DETAILS CREDIT-LIMIT None None NaN user_1023
... ... ... ... ... ... ... ... ... ...
137373 4935969 RESPONSES RESPONSE[7] LOAN-DETAILS CREDIT-LIMIT None None 1 26 000 user_99995
137374 4936078 RESPONSES RESPONSE[1] LOAN-DETAILS CREDIT-LIMIT None None 1 02 000 user_99997
137375 4936118 RESPONSES RESPONSE[2] LOAN-DETAILS CREDIT-LIMIT None None NaN user_99997
137376 4936136 RESPONSES RESPONSE[3] LOAN-DETAILS CREDIT-LIMIT None None 63 000 user_99997
137377 4936154 RESPONSES RESPONSE[4] LOAN-DETAILS CREDIT-LIMIT None None NaN user_99997

137378 rows × 9 columns

In [28]:
REPORT_DATE = df_final[df_final['b'].str.match('DATE-OF-REQUEST').fillna(False)]
REPORT_DATE.rename(columns = {"G": "DATE-OF-REQUEST"}, inplace = True)
REPORT_DATE.reset_index(inplace = True) 
REPORT_DATE
Out[28]:
index a b c d e f DATE-OF-REQUEST U_USER_ID
0 29 HEADER DATE-OF-REQUEST None None None None 19-09-2019 user_10
1 142 HEADER DATE-OF-REQUEST None None None None 19-09-2019 user_1023
2 313 HEADER DATE-OF-REQUEST None None None None 19-09-2019 user_1040
3 1015 HEADER DATE-OF-REQUEST None None None None 19-09-2019 user_1041
4 1244 HEADER DATE-OF-REQUEST None None None None 20-09-2019 user_1042
... ... ... ... ... ... ... ... ... ...
18120 4935026 HEADER DATE-OF-REQUEST None None None None 26-09-2019 user_99992
18121 4935308 HEADER DATE-OF-REQUEST None None None None 29-11-2018 user_99993
18122 4935427 HEADER DATE-OF-REQUEST None None None None 26-09-2019 user_99994
18123 4935708 HEADER DATE-OF-REQUEST None None None None 26-09-2019 user_99995
18124 4936014 HEADER DATE-OF-REQUEST None None None None 26-09-2019 user_99997

18125 rows × 9 columns

In [29]:
REPORT_DATE = REPORT_DATE.drop(['index','a','b','c','d','e','f'],axis=1)
In [30]:
REPORT_DATE
Out[30]:
DATE-OF-REQUEST U_USER_ID
0 19-09-2019 user_10
1 19-09-2019 user_1023
2 19-09-2019 user_1040
3 19-09-2019 user_1041
4 20-09-2019 user_1042
... ... ...
18120 26-09-2019 user_99992
18121 29-11-2018 user_99993
18122 26-09-2019 user_99994
18123 26-09-2019 user_99995
18124 26-09-2019 user_99997

18125 rows × 2 columns

In [31]:
REPORT_DATE['DATE-OF-REQUEST']= pd.to_datetime(REPORT_DATE['DATE-OF-REQUEST'], errors = 'coerce')
In [32]:
REPORT_DATE
Out[32]:
DATE-OF-REQUEST U_USER_ID
0 2019-09-19 user_10
1 2019-09-19 user_1023
2 2019-09-19 user_1040
3 2019-09-19 user_1041
4 2019-09-20 user_1042
... ... ...
18120 2019-09-26 user_99992
18121 2018-11-29 user_99993
18122 2019-09-26 user_99994
18123 2019-09-26 user_99995
18124 2019-09-26 user_99997

18125 rows × 2 columns

In [ ]:
 
In [33]:
REPORT_DATE.to_csv(r'C:\Users\nitishkumar\Desktop\P\Cred\Data\Combined_Data\Final_converted_sheet_accounts\Report_date_v5.csv', index= False)
In [ ]:
 
In [ ]:
 
In [34]:
Account_Table = pd.concat([OPEN_DATE,DATE_CLOSED,ACCOUNT_TYPE,AMOUNT_PAST_DUE,ACCOUNT_STATUS,ACCOUNTHOLDER_TYPE_CODE,HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT,CURRENT_BALANCE,Credit_Limit_Amount], axis=1,)
In [35]:
Account_Table
Out[35]:
index a b c d e f OPEN_DATE U_USER_ID index ... U_USER_ID index a b c d e f Credit_Limit_Amount U_USER_ID
0 81 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-DT None None 08-01-2019 user_10 74 ... user_10 77 RESPONSES RESPONSE[1] LOAN-DETAILS CREDIT-LIMIT None None 20 000 user_10
1 99 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-DT None None 05-10-2015 user_10 92 ... user_10 95 RESPONSES RESPONSE[2] LOAN-DETAILS CREDIT-LIMIT None None NaN user_10
2 209 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-DT None None 15-07-2015 user_1023 202 ... user_1023 205 RESPONSES RESPONSE[1] LOAN-DETAILS CREDIT-LIMIT None None NaN user_1023
3 238 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-DT None None 20-08-2018 user_1023 231 ... user_1023 234 RESPONSES RESPONSE[2] LOAN-DETAILS CREDIT-LIMIT None None 61 000 user_1023
4 267 RESPONSES RESPONSE[3] LOAN-DETAILS DISBURSED-DT None None 06-12-2018 user_1023 260 ... user_1023 263 RESPONSES RESPONSE[3] LOAN-DETAILS CREDIT-LIMIT None None NaN user_1023
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
137373 4935973 RESPONSES RESPONSE[7] LOAN-DETAILS DISBURSED-DT None None 31-07-2012 user_99995 4935966 ... user_99995 4935969 RESPONSES RESPONSE[7] LOAN-DETAILS CREDIT-LIMIT None None 1 26 000 user_99995
137374 4936082 RESPONSES RESPONSE[1] LOAN-DETAILS DISBURSED-DT None None 05-10-2006 user_99997 4936075 ... user_99997 4936078 RESPONSES RESPONSE[1] LOAN-DETAILS CREDIT-LIMIT None None 1 02 000 user_99997
137375 4936122 RESPONSES RESPONSE[2] LOAN-DETAILS DISBURSED-DT None None 01-08-2012 user_99997 4936115 ... user_99997 4936118 RESPONSES RESPONSE[2] LOAN-DETAILS CREDIT-LIMIT None None NaN user_99997
137376 4936140 RESPONSES RESPONSE[3] LOAN-DETAILS DISBURSED-DT None None 30-11-2017 user_99997 4936133 ... user_99997 4936136 RESPONSES RESPONSE[3] LOAN-DETAILS CREDIT-LIMIT None None 63 000 user_99997
137377 4936158 RESPONSES RESPONSE[4] LOAN-DETAILS DISBURSED-DT None None 30-09-2006 user_99997 4936151 ... user_99997 4936154 RESPONSES RESPONSE[4] LOAN-DETAILS CREDIT-LIMIT None None NaN user_99997

137378 rows × 81 columns

In [36]:
Account_Table.drop(['index','a','b','c','d','e','f'], axis = 1,inplace = True)
In [37]:
Account_Table
Out[37]:
OPEN_DATE U_USER_ID DATE_CLOSED U_USER_ID ACCOUNT_TYPE U_USER_ID AMOUNT_PAST_DUE U_USER_ID ACCOUNT_STATUS U_USER_ID ACCOUNTHOLDER_TYPE_CODE U_USER_ID HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT U_USER_ID CURRENT_BALANCE U_USER_ID Credit_Limit_Amount U_USER_ID
0 08-01-2019 user_10 NaN user_10 Credit Card user_10 0 user_10 Active user_10 Individual user_10 20 727 user_10 4 193 user_10 20 000 user_10
1 05-10-2015 user_10 27-10-2016 user_10 Housing Loan user_10 0 user_10 Closed user_10 Joint user_10 15 00 000 user_10 0 user_10 NaN user_10
2 15-07-2015 user_1023 NaN user_1023 Credit Card user_1023 NaN user_1023 Active user_1023 Individual user_1023 63 705 user_1023 103 user_1023 NaN user_1023
3 20-08-2018 user_1023 NaN user_1023 Credit Card user_1023 0 user_1023 Active user_1023 Individual user_1023 27 042 user_1023 16 121 user_1023 61 000 user_1023
4 06-12-2018 user_1023 NaN user_1023 Credit Card user_1023 NaN user_1023 Active user_1023 Individual user_1023 7 327 user_1023 0 user_1023 NaN user_1023
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
137373 31-07-2012 user_99995 NaN user_99995 Credit Card user_99995 0 user_99995 Active user_99995 Individual user_99995 1 25 179 user_99995 16 942 user_99995 1 26 000 user_99995
137374 05-10-2006 user_99997 NaN user_99997 Credit Card user_99997 0 user_99997 Active user_99997 Individual user_99997 1 07 686 user_99997 16 290 user_99997 1 02 000 user_99997
137375 01-08-2012 user_99997 29-08-2016 user_99997 Credit Card user_99997 0 user_99997 Closed user_99997 Individual user_99997 NaN user_99997 0 user_99997 NaN user_99997
137376 30-11-2017 user_99997 NaN user_99997 Credit Card user_99997 0 user_99997 Active user_99997 Individual user_99997 1 536 user_99997 -689 user_99997 63 000 user_99997
137377 30-09-2006 user_99997 28-03-2011 user_99997 Credit Card user_99997 0 user_99997 Closed user_99997 Individual user_99997 10 395 user_99997 0 user_99997 NaN user_99997

137378 rows × 18 columns

In [ ]:
 
In [38]:
Account_Table.columns = ['OPEN_DATE','U_USER_ID','DATE_CLOSED','u_USER_ID','ACCOUNT_TYPE','u_USER_ID','AMOUNT_PAST_DUE','u_USER_ID','ACCOUNT_STATUS','u_USER_ID','ACCOUNTHOLDER_TYPE_CODE','u_USER_ID','HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT','u_USER_ID','CURRENT_BALANCE','u_USER_ID','Credit_Limit_Amount','u_USER_ID']
In [39]:
Account_Table
Out[39]:
OPEN_DATE U_USER_ID DATE_CLOSED u_USER_ID ACCOUNT_TYPE u_USER_ID AMOUNT_PAST_DUE u_USER_ID ACCOUNT_STATUS u_USER_ID ACCOUNTHOLDER_TYPE_CODE u_USER_ID HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT u_USER_ID CURRENT_BALANCE u_USER_ID Credit_Limit_Amount u_USER_ID
0 08-01-2019 user_10 NaN user_10 Credit Card user_10 0 user_10 Active user_10 Individual user_10 20 727 user_10 4 193 user_10 20 000 user_10
1 05-10-2015 user_10 27-10-2016 user_10 Housing Loan user_10 0 user_10 Closed user_10 Joint user_10 15 00 000 user_10 0 user_10 NaN user_10
2 15-07-2015 user_1023 NaN user_1023 Credit Card user_1023 NaN user_1023 Active user_1023 Individual user_1023 63 705 user_1023 103 user_1023 NaN user_1023
3 20-08-2018 user_1023 NaN user_1023 Credit Card user_1023 0 user_1023 Active user_1023 Individual user_1023 27 042 user_1023 16 121 user_1023 61 000 user_1023
4 06-12-2018 user_1023 NaN user_1023 Credit Card user_1023 NaN user_1023 Active user_1023 Individual user_1023 7 327 user_1023 0 user_1023 NaN user_1023
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
137373 31-07-2012 user_99995 NaN user_99995 Credit Card user_99995 0 user_99995 Active user_99995 Individual user_99995 1 25 179 user_99995 16 942 user_99995 1 26 000 user_99995
137374 05-10-2006 user_99997 NaN user_99997 Credit Card user_99997 0 user_99997 Active user_99997 Individual user_99997 1 07 686 user_99997 16 290 user_99997 1 02 000 user_99997
137375 01-08-2012 user_99997 29-08-2016 user_99997 Credit Card user_99997 0 user_99997 Closed user_99997 Individual user_99997 NaN user_99997 0 user_99997 NaN user_99997
137376 30-11-2017 user_99997 NaN user_99997 Credit Card user_99997 0 user_99997 Active user_99997 Individual user_99997 1 536 user_99997 -689 user_99997 63 000 user_99997
137377 30-09-2006 user_99997 28-03-2011 user_99997 Credit Card user_99997 0 user_99997 Closed user_99997 Individual user_99997 10 395 user_99997 0 user_99997 NaN user_99997

137378 rows × 18 columns

In [40]:
Account_Table.drop(['u_USER_ID'], axis = 1,inplace = True)
In [41]:
Account_Table
Out[41]:
OPEN_DATE U_USER_ID DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount
0 08-01-2019 user_10 NaN Credit Card 0 Active Individual 20 727 4 193 20 000
1 05-10-2015 user_10 27-10-2016 Housing Loan 0 Closed Joint 15 00 000 0 NaN
2 15-07-2015 user_1023 NaN Credit Card NaN Active Individual 63 705 103 NaN
3 20-08-2018 user_1023 NaN Credit Card 0 Active Individual 27 042 16 121 61 000
4 06-12-2018 user_1023 NaN Credit Card NaN Active Individual 7 327 0 NaN
... ... ... ... ... ... ... ... ... ... ...
137373 31-07-2012 user_99995 NaN Credit Card 0 Active Individual 1 25 179 16 942 1 26 000
137374 05-10-2006 user_99997 NaN Credit Card 0 Active Individual 1 07 686 16 290 1 02 000
137375 01-08-2012 user_99997 29-08-2016 Credit Card 0 Closed Individual NaN 0 NaN
137376 30-11-2017 user_99997 NaN Credit Card 0 Active Individual 1 536 -689 63 000
137377 30-09-2006 user_99997 28-03-2011 Credit Card 0 Closed Individual 10 395 0 NaN

137378 rows × 10 columns

In [42]:
Account_Table.insert(0, "XML_SEQ_NO",1)
In [43]:
Account_Table.insert(2, "Account_Number",'NA')
In [44]:
Account_Table.shape
Out[44]:
(137378, 12)
In [45]:
Account_Table
Out[45]:
XML_SEQ_NO OPEN_DATE Account_Number U_USER_ID DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount
0 1 08-01-2019 NA user_10 NaN Credit Card 0 Active Individual 20 727 4 193 20 000
1 1 05-10-2015 NA user_10 27-10-2016 Housing Loan 0 Closed Joint 15 00 000 0 NaN
2 1 15-07-2015 NA user_1023 NaN Credit Card NaN Active Individual 63 705 103 NaN
3 1 20-08-2018 NA user_1023 NaN Credit Card 0 Active Individual 27 042 16 121 61 000
4 1 06-12-2018 NA user_1023 NaN Credit Card NaN Active Individual 7 327 0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 1 31-07-2012 NA user_99995 NaN Credit Card 0 Active Individual 1 25 179 16 942 1 26 000
137374 1 05-10-2006 NA user_99997 NaN Credit Card 0 Active Individual 1 07 686 16 290 1 02 000
137375 1 01-08-2012 NA user_99997 29-08-2016 Credit Card 0 Closed Individual NaN 0 NaN
137376 1 30-11-2017 NA user_99997 NaN Credit Card 0 Active Individual 1 536 -689 63 000
137377 1 30-09-2006 NA user_99997 28-03-2011 Credit Card 0 Closed Individual 10 395 0 NaN

137378 rows × 12 columns

In [46]:
Account_Table = Account_Table[['U_USER_ID','XML_SEQ_NO','Account_Number','OPEN_DATE','DATE_CLOSED','ACCOUNT_TYPE','AMOUNT_PAST_DUE','ACCOUNT_STATUS','ACCOUNTHOLDER_TYPE_CODE','HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT','CURRENT_BALANCE','Credit_Limit_Amount']]
In [47]:
Account_Table
Out[47]:
U_USER_ID XML_SEQ_NO Account_Number OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount
0 user_10 1 NA 08-01-2019 NaN Credit Card 0 Active Individual 20 727 4 193 20 000
1 user_10 1 NA 05-10-2015 27-10-2016 Housing Loan 0 Closed Joint 15 00 000 0 NaN
2 user_1023 1 NA 15-07-2015 NaN Credit Card NaN Active Individual 63 705 103 NaN
3 user_1023 1 NA 20-08-2018 NaN Credit Card 0 Active Individual 27 042 16 121 61 000
4 user_1023 1 NA 06-12-2018 NaN Credit Card NaN Active Individual 7 327 0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 NA 31-07-2012 NaN Credit Card 0 Active Individual 1 25 179 16 942 1 26 000
137374 user_99997 1 NA 05-10-2006 NaN Credit Card 0 Active Individual 1 07 686 16 290 1 02 000
137375 user_99997 1 NA 01-08-2012 29-08-2016 Credit Card 0 Closed Individual NaN 0 NaN
137376 user_99997 1 NA 30-11-2017 NaN Credit Card 0 Active Individual 1 536 -689 63 000
137377 user_99997 1 NA 30-09-2006 28-03-2011 Credit Card 0 Closed Individual 10 395 0 NaN

137378 rows × 12 columns

In [ ]:
 
In [48]:
Account_Table
Out[48]:
U_USER_ID XML_SEQ_NO Account_Number OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount
0 user_10 1 NA 08-01-2019 NaN Credit Card 0 Active Individual 20 727 4 193 20 000
1 user_10 1 NA 05-10-2015 27-10-2016 Housing Loan 0 Closed Joint 15 00 000 0 NaN
2 user_1023 1 NA 15-07-2015 NaN Credit Card NaN Active Individual 63 705 103 NaN
3 user_1023 1 NA 20-08-2018 NaN Credit Card 0 Active Individual 27 042 16 121 61 000
4 user_1023 1 NA 06-12-2018 NaN Credit Card NaN Active Individual 7 327 0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 NA 31-07-2012 NaN Credit Card 0 Active Individual 1 25 179 16 942 1 26 000
137374 user_99997 1 NA 05-10-2006 NaN Credit Card 0 Active Individual 1 07 686 16 290 1 02 000
137375 user_99997 1 NA 01-08-2012 29-08-2016 Credit Card 0 Closed Individual NaN 0 NaN
137376 user_99997 1 NA 30-11-2017 NaN Credit Card 0 Active Individual 1 536 -689 63 000
137377 user_99997 1 NA 30-09-2006 28-03-2011 Credit Card 0 Closed Individual 10 395 0 NaN

137378 rows × 12 columns

In [49]:
def f(flag):
    global previous_seq
    previous_seq = 1 if flag == 0 else previous_seq + 1
    return previous_seq

previous_seq = 0
Account_Table['seq'] = Account_Table[['U_USER_ID']].apply(lambda x: f(*x), axis=1)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
In [50]:
Account_Table
Out[50]:
U_USER_ID XML_SEQ_NO Account_Number OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount seq
0 user_10 1 NA 08-01-2019 NaN Credit Card 0 Active Individual 20 727 4 193 20 000 1
1 user_10 1 NA 05-10-2015 27-10-2016 Housing Loan 0 Closed Joint 15 00 000 0 NaN 2
2 user_1023 1 NA 15-07-2015 NaN Credit Card NaN Active Individual 63 705 103 NaN 3
3 user_1023 1 NA 20-08-2018 NaN Credit Card 0 Active Individual 27 042 16 121 61 000 4
4 user_1023 1 NA 06-12-2018 NaN Credit Card NaN Active Individual 7 327 0 NaN 5
... ... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 NA 31-07-2012 NaN Credit Card 0 Active Individual 1 25 179 16 942 1 26 000 137374
137374 user_99997 1 NA 05-10-2006 NaN Credit Card 0 Active Individual 1 07 686 16 290 1 02 000 137375
137375 user_99997 1 NA 01-08-2012 29-08-2016 Credit Card 0 Closed Individual NaN 0 NaN 137376
137376 user_99997 1 NA 30-11-2017 NaN Credit Card 0 Active Individual 1 536 -689 63 000 137377
137377 user_99997 1 NA 30-09-2006 28-03-2011 Credit Card 0 Closed Individual 10 395 0 NaN 137378

137378 rows × 13 columns

In [51]:
Account_Table['seq'] = Account_Table['seq'].astype(str)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [52]:
Account_Table = Account_Table.drop(['Account_Number'], axis = 1) 
In [53]:
Account_Table['ACCOUNT_NUMBER'] = Account_Table['U_USER_ID'] + '_'+ Account_Table['seq']
Account_Table
Out[53]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount seq ACCOUNT_NUMBER
0 user_10 1 08-01-2019 NaN Credit Card 0 Active Individual 20 727 4 193 20 000 1 user_10_1
1 user_10 1 05-10-2015 27-10-2016 Housing Loan 0 Closed Joint 15 00 000 0 NaN 2 user_10_2
2 user_1023 1 15-07-2015 NaN Credit Card NaN Active Individual 63 705 103 NaN 3 user_1023_3
3 user_1023 1 20-08-2018 NaN Credit Card 0 Active Individual 27 042 16 121 61 000 4 user_1023_4
4 user_1023 1 06-12-2018 NaN Credit Card NaN Active Individual 7 327 0 NaN 5 user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 31-07-2012 NaN Credit Card 0 Active Individual 1 25 179 16 942 1 26 000 137374 user_99995_137374
137374 user_99997 1 05-10-2006 NaN Credit Card 0 Active Individual 1 07 686 16 290 1 02 000 137375 user_99997_137375
137375 user_99997 1 01-08-2012 29-08-2016 Credit Card 0 Closed Individual NaN 0 NaN 137376 user_99997_137376
137376 user_99997 1 30-11-2017 NaN Credit Card 0 Active Individual 1 536 -689 63 000 137377 user_99997_137377
137377 user_99997 1 30-09-2006 28-03-2011 Credit Card 0 Closed Individual 10 395 0 NaN 137378 user_99997_137378

137378 rows × 13 columns

In [54]:
Account_Table['ACCOUNT_STATUS'].value_counts()
Out[54]:
Active    83318
Closed    54060
Name: ACCOUNT_STATUS, dtype: int64
In [55]:
Account_Table['HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT'] = Account_Table['HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT'].str.replace(' ', '')
In [56]:
Account_Table
Out[56]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount seq ACCOUNT_NUMBER
0 user_10 1 08-01-2019 NaN Credit Card 0 Active Individual 20727 4 193 20 000 1 user_10_1
1 user_10 1 05-10-2015 27-10-2016 Housing Loan 0 Closed Joint 1500000 0 NaN 2 user_10_2
2 user_1023 1 15-07-2015 NaN Credit Card NaN Active Individual 63705 103 NaN 3 user_1023_3
3 user_1023 1 20-08-2018 NaN Credit Card 0 Active Individual 27042 16 121 61 000 4 user_1023_4
4 user_1023 1 06-12-2018 NaN Credit Card NaN Active Individual 7327 0 NaN 5 user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 31-07-2012 NaN Credit Card 0 Active Individual 125179 16 942 1 26 000 137374 user_99995_137374
137374 user_99997 1 05-10-2006 NaN Credit Card 0 Active Individual 107686 16 290 1 02 000 137375 user_99997_137375
137375 user_99997 1 01-08-2012 29-08-2016 Credit Card 0 Closed Individual NaN 0 NaN 137376 user_99997_137376
137376 user_99997 1 30-11-2017 NaN Credit Card 0 Active Individual 1536 -689 63 000 137377 user_99997_137377
137377 user_99997 1 30-09-2006 28-03-2011 Credit Card 0 Closed Individual 10395 0 NaN 137378 user_99997_137378

137378 rows × 13 columns

In [57]:
Account_Table['CURRENT_BALANCE'] = Account_Table['CURRENT_BALANCE'].str.replace(' ', '')
In [58]:
Account_Table['Credit_Limit_Amount'] = Account_Table['Credit_Limit_Amount'].str.replace(' ', '')
In [59]:
#Account_Table = Account_Table.fillna(0)
In [60]:
Account_Table
Out[60]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount seq ACCOUNT_NUMBER
0 user_10 1 08-01-2019 NaN Credit Card 0 Active Individual 20727 4193 20000 1 user_10_1
1 user_10 1 05-10-2015 27-10-2016 Housing Loan 0 Closed Joint 1500000 0 NaN 2 user_10_2
2 user_1023 1 15-07-2015 NaN Credit Card NaN Active Individual 63705 103 NaN 3 user_1023_3
3 user_1023 1 20-08-2018 NaN Credit Card 0 Active Individual 27042 16121 61000 4 user_1023_4
4 user_1023 1 06-12-2018 NaN Credit Card NaN Active Individual 7327 0 NaN 5 user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 31-07-2012 NaN Credit Card 0 Active Individual 125179 16942 126000 137374 user_99995_137374
137374 user_99997 1 05-10-2006 NaN Credit Card 0 Active Individual 107686 16290 102000 137375 user_99997_137375
137375 user_99997 1 01-08-2012 29-08-2016 Credit Card 0 Closed Individual NaN 0 NaN 137376 user_99997_137376
137376 user_99997 1 30-11-2017 NaN Credit Card 0 Active Individual 1536 -689 63000 137377 user_99997_137377
137377 user_99997 1 30-09-2006 28-03-2011 Credit Card 0 Closed Individual 10395 0 NaN 137378 user_99997_137378

137378 rows × 13 columns

In [61]:
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Credit Card', 10,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Housing Loan',2,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Property Loan',3,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Personal Loan',5,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Consumer Loan', 6,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Gold Loan', 7,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Auto Loan (Personal)', 1,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Loan to Professional', 9,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Secured Credit Card', 31,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Used Car Loan', 32,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Loan Priority Sector Agriculture', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Loan General', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Loan Priority Sector Small Business', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Loan Unsecured', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Loan Against Bank Deposits', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Loan Priority Sector Others', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Loan - Secured', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Non-Funded Credit Facility General', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Business Non-Funded Credit Facility-Priority Sector- Small Business', 51,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Other', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Overdraft', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Education Loan', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Corporate Credit Card', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Loan Against Bank Deposits', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Loan Against Shares / Securities', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Commercial Vehicle Loan', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Two-Wheeler Loan', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Kisan Credit Card', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Pradhan Mantri Awas Yojana - CLSS', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='JLG Individual', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Non-Funded Credit Facility', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Microfinance Personal Loan', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Mudra Loans Shishu / Kishor / Tarun', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Construction Equipment Loan', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Tractor Loan', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Commercial Equipment Loan', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Individual', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Prime Minister Jaan Dhan Yojana - Overdraft', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Loan on Credit Card', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Microfinance Business Loan', 0,Account_Table.ACCOUNT_TYPE)
Account_Table.ACCOUNT_TYPE = np.where(Account_Table.ACCOUNT_TYPE=='Microfinance Housing Loan', 0,Account_Table.ACCOUNT_TYPE)
In [62]:
Account_Table['ACCOUNT_TYPE'].value_counts()
Out[62]:
10    70696
6     18417
5     17331
0     10154
2      7300
1      5141
7      4833
51     1921
3      1047
32      419
31       73
9        46
Name: ACCOUNT_TYPE, dtype: int64
In [63]:
Account_Table['ACCOUNT_TYPE'] 
Out[63]:
0         10
1          2
2         10
3         10
4         10
          ..
137373    10
137374    10
137375    10
137376    10
137377    10
Name: ACCOUNT_TYPE, Length: 137378, dtype: object
In [64]:
Account_Table=Account_Table.drop(['seq'], axis = 1) 
In [65]:
Account_Table
Out[65]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount ACCOUNT_NUMBER
0 user_10 1 08-01-2019 NaN 10 0 Active Individual 20727 4193 20000 user_10_1
1 user_10 1 05-10-2015 27-10-2016 2 0 Closed Joint 1500000 0 NaN user_10_2
2 user_1023 1 15-07-2015 NaN 10 NaN Active Individual 63705 103 NaN user_1023_3
3 user_1023 1 20-08-2018 NaN 10 0 Active Individual 27042 16121 61000 user_1023_4
4 user_1023 1 06-12-2018 NaN 10 NaN Active Individual 7327 0 NaN user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 31-07-2012 NaN 10 0 Active Individual 125179 16942 126000 user_99995_137374
137374 user_99997 1 05-10-2006 NaN 10 0 Active Individual 107686 16290 102000 user_99997_137375
137375 user_99997 1 01-08-2012 29-08-2016 10 0 Closed Individual NaN 0 NaN user_99997_137376
137376 user_99997 1 30-11-2017 NaN 10 0 Active Individual 1536 -689 63000 user_99997_137377
137377 user_99997 1 30-09-2006 28-03-2011 10 0 Closed Individual 10395 0 NaN user_99997_137378

137378 rows × 12 columns

In [66]:
Account_Table['OPEN_DATE']= pd.to_datetime(Account_Table.OPEN_DATE, errors = 'coerce')
In [67]:
Account_Table['OPEN_DATE'].value_counts()
Out[67]:
2019-04-30    269
2019-02-28    246
2019-05-31    245
2018-05-31    244
2018-10-31    242
             ... 
2001-08-28      1
2005-06-28      1
1996-02-09      1
2004-02-25      1
2004-12-05      1
Name: OPEN_DATE, Length: 6031, dtype: int64
In [ ]:
 
In [68]:
Account_Table
Out[68]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount ACCOUNT_NUMBER
0 user_10 1 2019-08-01 NaN 10 0 Active Individual 20727 4193 20000 user_10_1
1 user_10 1 2015-05-10 27-10-2016 2 0 Closed Joint 1500000 0 NaN user_10_2
2 user_1023 1 2015-07-15 NaN 10 NaN Active Individual 63705 103 NaN user_1023_3
3 user_1023 1 2018-08-20 NaN 10 0 Active Individual 27042 16121 61000 user_1023_4
4 user_1023 1 2018-06-12 NaN 10 NaN Active Individual 7327 0 NaN user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 2012-07-31 NaN 10 0 Active Individual 125179 16942 126000 user_99995_137374
137374 user_99997 1 2006-05-10 NaN 10 0 Active Individual 107686 16290 102000 user_99997_137375
137375 user_99997 1 2012-01-08 29-08-2016 10 0 Closed Individual NaN 0 NaN user_99997_137376
137376 user_99997 1 2017-11-30 NaN 10 0 Active Individual 1536 -689 63000 user_99997_137377
137377 user_99997 1 2006-09-30 28-03-2011 10 0 Closed Individual 10395 0 NaN user_99997_137378

137378 rows × 12 columns

In [69]:
Account_Table
Out[69]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount ACCOUNT_NUMBER
0 user_10 1 2019-08-01 NaN 10 0 Active Individual 20727 4193 20000 user_10_1
1 user_10 1 2015-05-10 27-10-2016 2 0 Closed Joint 1500000 0 NaN user_10_2
2 user_1023 1 2015-07-15 NaN 10 NaN Active Individual 63705 103 NaN user_1023_3
3 user_1023 1 2018-08-20 NaN 10 0 Active Individual 27042 16121 61000 user_1023_4
4 user_1023 1 2018-06-12 NaN 10 NaN Active Individual 7327 0 NaN user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 2012-07-31 NaN 10 0 Active Individual 125179 16942 126000 user_99995_137374
137374 user_99997 1 2006-05-10 NaN 10 0 Active Individual 107686 16290 102000 user_99997_137375
137375 user_99997 1 2012-01-08 29-08-2016 10 0 Closed Individual NaN 0 NaN user_99997_137376
137376 user_99997 1 2017-11-30 NaN 10 0 Active Individual 1536 -689 63000 user_99997_137377
137377 user_99997 1 2006-09-30 28-03-2011 10 0 Closed Individual 10395 0 NaN user_99997_137378

137378 rows × 12 columns

In [70]:
Account_Table['DATE_CLOSED'] = Account_Table['DATE_CLOSED'].replace(np.nan, '01-01-1990')
Account_Table
Out[70]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount ACCOUNT_NUMBER
0 user_10 1 2019-08-01 01-01-1990 10 0 Active Individual 20727 4193 20000 user_10_1
1 user_10 1 2015-05-10 27-10-2016 2 0 Closed Joint 1500000 0 NaN user_10_2
2 user_1023 1 2015-07-15 01-01-1990 10 NaN Active Individual 63705 103 NaN user_1023_3
3 user_1023 1 2018-08-20 01-01-1990 10 0 Active Individual 27042 16121 61000 user_1023_4
4 user_1023 1 2018-06-12 01-01-1990 10 NaN Active Individual 7327 0 NaN user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 2012-07-31 01-01-1990 10 0 Active Individual 125179 16942 126000 user_99995_137374
137374 user_99997 1 2006-05-10 01-01-1990 10 0 Active Individual 107686 16290 102000 user_99997_137375
137375 user_99997 1 2012-01-08 29-08-2016 10 0 Closed Individual NaN 0 NaN user_99997_137376
137376 user_99997 1 2017-11-30 01-01-1990 10 0 Active Individual 1536 -689 63000 user_99997_137377
137377 user_99997 1 2006-09-30 28-03-2011 10 0 Closed Individual 10395 0 NaN user_99997_137378

137378 rows × 12 columns

In [71]:
Account_Table['DATE_CLOSED']= pd.to_datetime(Account_Table.DATE_CLOSED, errors = 'coerce')
In [72]:
Account_Table
Out[72]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount ACCOUNT_NUMBER
0 user_10 1 2019-08-01 1990-01-01 10 0 Active Individual 20727 4193 20000 user_10_1
1 user_10 1 2015-05-10 2016-10-27 2 0 Closed Joint 1500000 0 NaN user_10_2
2 user_1023 1 2015-07-15 1990-01-01 10 NaN Active Individual 63705 103 NaN user_1023_3
3 user_1023 1 2018-08-20 1990-01-01 10 0 Active Individual 27042 16121 61000 user_1023_4
4 user_1023 1 2018-06-12 1990-01-01 10 NaN Active Individual 7327 0 NaN user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 2012-07-31 1990-01-01 10 0 Active Individual 125179 16942 126000 user_99995_137374
137374 user_99997 1 2006-05-10 1990-01-01 10 0 Active Individual 107686 16290 102000 user_99997_137375
137375 user_99997 1 2012-01-08 2016-08-29 10 0 Closed Individual NaN 0 NaN user_99997_137376
137376 user_99997 1 2017-11-30 1990-01-01 10 0 Active Individual 1536 -689 63000 user_99997_137377
137377 user_99997 1 2006-09-30 2011-03-28 10 0 Closed Individual 10395 0 NaN user_99997_137378

137378 rows × 12 columns

In [73]:
Account_Table
Out[73]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount ACCOUNT_NUMBER
0 user_10 1 2019-08-01 1990-01-01 10 0 Active Individual 20727 4193 20000 user_10_1
1 user_10 1 2015-05-10 2016-10-27 2 0 Closed Joint 1500000 0 NaN user_10_2
2 user_1023 1 2015-07-15 1990-01-01 10 NaN Active Individual 63705 103 NaN user_1023_3
3 user_1023 1 2018-08-20 1990-01-01 10 0 Active Individual 27042 16121 61000 user_1023_4
4 user_1023 1 2018-06-12 1990-01-01 10 NaN Active Individual 7327 0 NaN user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 2012-07-31 1990-01-01 10 0 Active Individual 125179 16942 126000 user_99995_137374
137374 user_99997 1 2006-05-10 1990-01-01 10 0 Active Individual 107686 16290 102000 user_99997_137375
137375 user_99997 1 2012-01-08 2016-08-29 10 0 Closed Individual NaN 0 NaN user_99997_137376
137376 user_99997 1 2017-11-30 1990-01-01 10 0 Active Individual 1536 -689 63000 user_99997_137377
137377 user_99997 1 2006-09-30 2011-03-28 10 0 Closed Individual 10395 0 NaN user_99997_137378

137378 rows × 12 columns

In [74]:
Account_Table['OPEN_DATE'].value_counts()
Out[74]:
2019-04-30    269
2019-02-28    246
2019-05-31    245
2018-05-31    244
2018-10-31    242
             ... 
2001-08-28      1
2005-06-28      1
1996-02-09      1
2004-02-25      1
2004-12-05      1
Name: OPEN_DATE, Length: 6031, dtype: int64
In [75]:
Account_Table['AMOUNT_PAST_DUE'] = Account_Table['AMOUNT_PAST_DUE'].str.replace(' ', '')
In [76]:
Account_Table['AMOUNT_PAST_DUE'] .value_counts()
Out[76]:
0         94322
200          32
290          26
100          21
1            19
          ...  
113228        1
2158          1
4362          1
49977         1
147           1
Name: AMOUNT_PAST_DUE, Length: 1774, dtype: int64
In [77]:
Account_Table['AMOUNT_PAST_DUE'] = Account_Table['AMOUNT_PAST_DUE'].fillna(0)
Account_Table['HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT'] = Account_Table['HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT'].fillna(0)
Account_Table['CURRENT_BALANCE'] = Account_Table['CURRENT_BALANCE'].fillna(0)
Account_Table['Credit_Limit_Amount'] = Account_Table['Credit_Limit_Amount'].fillna(0)
In [78]:
Account_Table['DATE_CLOSED'].value_counts()
Out[78]:
1990-01-01    83537
2018-07-31      640
2018-08-26      229
2017-01-31      171
2018-12-17      147
              ...  
2008-12-22        1
2010-09-02        1
2004-10-13        1
2009-05-08        1
2011-02-07        1
Name: DATE_CLOSED, Length: 4187, dtype: int64
In [79]:
Account_Table
Out[79]:
U_USER_ID XML_SEQ_NO OPEN_DATE DATE_CLOSED ACCOUNT_TYPE AMOUNT_PAST_DUE ACCOUNT_STATUS ACCOUNTHOLDER_TYPE_CODE HIGHEST_CREDIT_OR_ORIGINAL_LOAN_AMOUNT CURRENT_BALANCE Credit_Limit_Amount ACCOUNT_NUMBER
0 user_10 1 2019-08-01 1990-01-01 10 0 Active Individual 20727 4193 20000 user_10_1
1 user_10 1 2015-05-10 2016-10-27 2 0 Closed Joint 1500000 0 0 user_10_2
2 user_1023 1 2015-07-15 1990-01-01 10 0 Active Individual 63705 103 0 user_1023_3
3 user_1023 1 2018-08-20 1990-01-01 10 0 Active Individual 27042 16121 61000 user_1023_4
4 user_1023 1 2018-06-12 1990-01-01 10 0 Active Individual 7327 0 0 user_1023_5
... ... ... ... ... ... ... ... ... ... ... ... ...
137373 user_99995 1 2012-07-31 1990-01-01 10 0 Active Individual 125179 16942 126000 user_99995_137374
137374 user_99997 1 2006-05-10 1990-01-01 10 0 Active Individual 107686 16290 102000 user_99997_137375
137375 user_99997 1 2012-01-08 2016-08-29 10 0 Closed Individual 0 0 0 user_99997_137376
137376 user_99997 1 2017-11-30 1990-01-01 10 0 Active Individual 1536 -689 63000 user_99997_137377
137377 user_99997 1 2006-09-30 2011-03-28 10 0 Closed Individual 10395 0 0 user_99997_137378

137378 rows × 12 columns

In [81]:
pp.ProfileReport(Account_Table)
Out[81]:

In [ ]:
 
In [ ]:
#Account_Table.to_csv(r'C:\Users\nitishkumar\Desktop\P\Cred\Data\Combined_Data\Final_converted_sheet_accounts\Cred_Account_Table_v5.csv', index = False)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: